1   package org.apache.lucene.search;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one or more
5    * contributor license agreements.  See the NOTICE file distributed with
6    * this work for additional information regarding copyright ownership.
7    * The ASF licenses this file to You under the Apache License, Version 2.0
8    * (the "License"); you may not use this file except in compliance with
9    * the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  
20  import java.io.IOException;
21  import java.util.ArrayList;
22  import java.util.Arrays;
23  import java.util.Comparator;
24  import java.util.HashMap;
25  import java.util.HashSet;
26  import java.util.LinkedHashMap;
27  
28  import org.apache.lucene.index.Term;
29  import org.apache.lucene.search.similarities.Similarity;
30  import org.apache.lucene.util.FixedBitSet;
31  
32  final class SloppyPhraseScorer extends Scorer {
33  
34    private final ConjunctionDISI conjunction;
35    private final PhrasePositions[] phrasePositions;
36  
37    private float sloppyFreq; //phrase frequency in current doc as computed by phraseFreq().
38  
39    private final Similarity.SimScorer docScorer;
40    
41    private final int slop;
42    private final int numPostings;
43    private final PhraseQueue pq; // for advancing min position
44    
45    private int end; // current largest phrase position  
46  
47    private boolean hasRpts; // flag indicating that there are repetitions (as checked in first candidate doc)
48    private boolean checkedRpts; // flag to only check for repetitions in first candidate doc
49    private boolean hasMultiTermRpts; //  
50    private PhrasePositions[][] rptGroups; // in each group are PPs that repeats each other (i.e. same term), sorted by (query) offset 
51    private PhrasePositions[] rptStack; // temporary stack for switching colliding repeating pps 
52    
53    private int numMatches;
54    final boolean needsScores;
55    private final float matchCost;
56    
57    SloppyPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings,
58        int slop, Similarity.SimScorer docScorer, boolean needsScores,
59        float matchCost) {
60      super(weight);
61      this.docScorer = docScorer;
62      this.needsScores = needsScores;
63      this.slop = slop;
64      this.numPostings = postings==null ? 0 : postings.length;
65      pq = new PhraseQueue(postings.length);
66      DocIdSetIterator[] iterators = new DocIdSetIterator[postings.length];
67      phrasePositions = new PhrasePositions[postings.length];
68      for (int i = 0; i < postings.length; ++i) {
69        iterators[i] = postings[i].postings;
70        phrasePositions[i] = new PhrasePositions(postings[i].postings, postings[i].position, i, postings[i].terms);
71      }
72      conjunction = ConjunctionDISI.intersect(Arrays.asList(iterators));
73      this.matchCost = matchCost;
74    }
75  
76    /**
77     * Score a candidate doc for all slop-valid position-combinations (matches) 
78     * encountered while traversing/hopping the PhrasePositions.
79     * <br> The score contribution of a match depends on the distance: 
80     * <br> - highest score for distance=0 (exact match).
81     * <br> - score gets lower as distance gets higher.
82     * <br>Example: for query "a b"~2, a document "x a b a y" can be scored twice: 
83     * once for "a b" (distance=0), and once for "b a" (distance=2).
84     * <br>Possibly not all valid combinations are encountered, because for efficiency  
85     * we always propagate the least PhrasePosition. This allows to base on 
86     * PriorityQueue and move forward faster. 
87     * As result, for example, document "a b c b a"
88     * would score differently for queries "a b c"~4 and "c b a"~4, although 
89     * they really are equivalent. 
90     * Similarly, for doc "a b c b a f g", query "c b"~2 
91     * would get same score as "g f"~2, although "c b"~2 could be matched twice.
92     * We may want to fix this in the future (currently not, for performance reasons).
93     */
94    private float phraseFreq() throws IOException {
95      if (!initPhrasePositions()) {
96        return 0.0f;
97      }
98      float freq = 0.0f;
99      numMatches = 0;
100     PhrasePositions pp = pq.pop();
101     int matchLength = end - pp.position;
102     int next = pq.top().position; 
103     while (advancePP(pp)) {
104       if (hasRpts && !advanceRpts(pp)) {
105         break; // pps exhausted
106       }
107       if (pp.position > next) { // done minimizing current match-length 
108         if (matchLength <= slop) {
109           freq += docScorer.computeSlopFactor(matchLength); // score match
110           numMatches++;
111           if (!needsScores) {
112             return freq;
113           }
114         }      
115         pq.add(pp);
116         pp = pq.pop();
117         next = pq.top().position;
118         matchLength = end - pp.position;
119       } else {
120         int matchLength2 = end - pp.position;
121         if (matchLength2 < matchLength) {
122           matchLength = matchLength2;
123         }
124       }
125     }
126     if (matchLength <= slop) {
127       freq += docScorer.computeSlopFactor(matchLength); // score match
128       numMatches++;
129     }    
130     return freq;
131   }
132 
133   /** advance a PhrasePosition and update 'end', return false if exhausted */
134   private boolean advancePP(PhrasePositions pp) throws IOException {
135     if (!pp.nextPosition()) {
136       return false;
137     }
138     if (pp.position > end) {
139       end = pp.position;
140     }
141     return true;
142   }
143   
144   /** pp was just advanced. If that caused a repeater collision, resolve by advancing the lesser
145    * of the two colliding pps. Note that there can only be one collision, as by the initialization
146    * there were no collisions before pp was advanced.  */
147   private boolean advanceRpts(PhrasePositions pp) throws IOException {
148     if (pp.rptGroup < 0) {
149       return true; // not a repeater
150     }
151     PhrasePositions[] rg = rptGroups[pp.rptGroup];
152     FixedBitSet bits = new FixedBitSet(rg.length); // for re-queuing after collisions are resolved
153     int k0 = pp.rptInd;
154     int k;
155     while((k=collide(pp)) >= 0) {
156       pp = lesser(pp, rg[k]); // always advance the lesser of the (only) two colliding pps
157       if (!advancePP(pp)) {
158         return false; // exhausted
159       }
160       if (k != k0) { // careful: mark only those currently in the queue
161         bits = FixedBitSet.ensureCapacity(bits, k);
162         bits.set(k); // mark that pp2 need to be re-queued
163       }
164     }
165     // collisions resolved, now re-queue
166     // empty (partially) the queue until seeing all pps advanced for resolving collisions
167     int n = 0;
168     // TODO would be good if we can avoid calling cardinality() in each iteration!
169     int numBits = bits.length(); // larges bit we set
170     while (bits.cardinality() > 0) {
171       PhrasePositions pp2 = pq.pop();
172       rptStack[n++] = pp2;
173       if (pp2.rptGroup >= 0 
174           && pp2.rptInd < numBits  // this bit may not have been set
175           && bits.get(pp2.rptInd)) {
176         bits.clear(pp2.rptInd);
177       }
178     }
179     // add back to queue
180     for (int i=n-1; i>=0; i--) {
181       pq.add(rptStack[i]);
182     }
183     return true;
184   }
185 
186   /** compare two pps, but only by position and offset */
187   private PhrasePositions lesser(PhrasePositions pp, PhrasePositions pp2) {
188     if (pp.position < pp2.position ||
189         (pp.position == pp2.position && pp.offset < pp2.offset)) {
190       return pp;
191     }
192     return pp2;
193   }
194 
195   /** index of a pp2 colliding with pp, or -1 if none */
196   private int collide(PhrasePositions pp) {
197     int tpPos = tpPos(pp);
198     PhrasePositions[] rg = rptGroups[pp.rptGroup];
199     for (int i=0; i<rg.length; i++) {
200       PhrasePositions pp2 = rg[i];
201       if (pp2 != pp && tpPos(pp2) == tpPos) {
202         return pp2.rptInd;
203       }
204     }
205     return -1;
206   }
207 
208   /**
209    * Initialize PhrasePositions in place.
210    * A one time initialization for this scorer (on first doc matching all terms):
211    * <ul>
212    *  <li>Check if there are repetitions
213    *  <li>If there are, find groups of repetitions.
214    * </ul>
215    * Examples:
216    * <ol>
217    *  <li>no repetitions: <b>"ho my"~2</b>
218    *  <li>repetitions: <b>"ho my my"~2</b>
219    *  <li>repetitions: <b>"my ho my"~2</b>
220    * </ol>
221    * @return false if PPs are exhausted (and so current doc will not be a match) 
222    */
223   private boolean initPhrasePositions() throws IOException {
224     end = Integer.MIN_VALUE;
225     if (!checkedRpts) {
226       return initFirstTime();
227     }
228     if (!hasRpts) {
229       initSimple();
230       return true; // PPs available
231     }
232     return initComplex();
233   }
234   
235   /** no repeats: simplest case, and most common. It is important to keep this piece of the code simple and efficient */
236   private void initSimple() throws IOException {
237     //System.err.println("initSimple: doc: "+min.doc);
238     pq.clear();
239     // position pps and build queue from list
240     for (PhrasePositions pp : phrasePositions) {
241       pp.firstPosition();
242       if (pp.position > end) {
243         end = pp.position;
244       }
245       pq.add(pp);
246     }
247   }
248   
249   /** with repeats: not so simple. */
250   private boolean initComplex() throws IOException {
251     //System.err.println("initComplex: doc: "+min.doc);
252     placeFirstPositions();
253     if (!advanceRepeatGroups()) {
254       return false; // PPs exhausted
255     }
256     fillQueue();
257     return true; // PPs available
258   }
259 
260   /** move all PPs to their first position */
261   private void placeFirstPositions() throws IOException {
262     for (PhrasePositions pp : phrasePositions) {
263       pp.firstPosition();
264     }
265   }
266 
267   /** Fill the queue (all pps are already placed */
268   private void fillQueue() {
269     pq.clear();
270     for (PhrasePositions pp : phrasePositions) {  // iterate cyclic list: done once handled max
271       if (pp.position > end) {
272         end = pp.position;
273       }
274       pq.add(pp);
275     }
276   }
277 
278   /** At initialization (each doc), each repetition group is sorted by (query) offset.
279    * This provides the start condition: no collisions.
280    * <p>Case 1: no multi-term repeats<br>
281    * It is sufficient to advance each pp in the group by one less than its group index.
282    * So lesser pp is not advanced, 2nd one advance once, 3rd one advanced twice, etc.
283    * <p>Case 2: multi-term repeats<br>
284    * 
285    * @return false if PPs are exhausted. 
286    */
287   private boolean advanceRepeatGroups() throws IOException {
288     for (PhrasePositions[] rg: rptGroups) { 
289       if (hasMultiTermRpts) {
290         // more involved, some may not collide
291         int incr;
292         for (int i=0; i<rg.length; i+=incr) {
293           incr = 1;
294           PhrasePositions pp = rg[i];
295           int k;
296           while((k=collide(pp)) >= 0) {
297             PhrasePositions pp2 = lesser(pp, rg[k]);
298             if (!advancePP(pp2)) {  // at initialization always advance pp with higher offset
299               return false; // exhausted
300             }
301             if (pp2.rptInd < i) { // should not happen?
302               incr = 0;
303               break;
304             }
305           }
306         }
307       } else {
308         // simpler, we know exactly how much to advance
309         for (int j=1; j<rg.length; j++) {
310           for (int k=0; k<j; k++) {
311             if (!rg[j].nextPosition()) {
312               return false; // PPs exhausted
313             }
314           }
315         }
316       }
317     }
318     return true; // PPs available
319   }
320   
321   /** initialize with checking for repeats. Heavy work, but done only for the first candidate doc.<p>
322    * If there are repetitions, check if multi-term postings (MTP) are involved.<p>
323    * Without MTP, once PPs are placed in the first candidate doc, repeats (and groups) are visible.<br>
324    * With MTP, a more complex check is needed, up-front, as there may be "hidden collisions".<br>
325    * For example P1 has {A,B}, P1 has {B,C}, and the first doc is: "A C B". At start, P1 would point
326    * to "A", p2 to "C", and it will not be identified that P1 and P2 are repetitions of each other.<p>
327    * The more complex initialization has two parts:<br>
328    * (1) identification of repetition groups.<br>
329    * (2) advancing repeat groups at the start of the doc.<br>
330    * For (1), a possible solution is to just create a single repetition group, 
331    * made of all repeating pps. But this would slow down the check for collisions, 
332    * as all pps would need to be checked. Instead, we compute "connected regions" 
333    * on the bipartite graph of postings and terms.  
334    */
335   private boolean initFirstTime() throws IOException {
336     //System.err.println("initFirstTime: doc: "+min.doc);
337     checkedRpts = true;
338     placeFirstPositions();
339 
340     LinkedHashMap<Term,Integer> rptTerms = repeatingTerms(); 
341     hasRpts = !rptTerms.isEmpty();
342 
343     if (hasRpts) {
344       rptStack = new PhrasePositions[numPostings]; // needed with repetitions
345       ArrayList<ArrayList<PhrasePositions>> rgs = gatherRptGroups(rptTerms);
346       sortRptGroups(rgs);
347       if (!advanceRepeatGroups()) {
348         return false; // PPs exhausted
349       }
350     }
351     
352     fillQueue();
353     return true; // PPs available
354   }
355 
356   /** sort each repetition group by (query) offset. 
357    * Done only once (at first doc) and allows to initialize faster for each doc. */
358   private void sortRptGroups(ArrayList<ArrayList<PhrasePositions>> rgs) {
359     rptGroups = new PhrasePositions[rgs.size()][];
360     Comparator<PhrasePositions> cmprtr = new Comparator<PhrasePositions>() {
361       @Override
362       public int compare(PhrasePositions pp1, PhrasePositions pp2) {
363         return pp1.offset - pp2.offset;
364       }
365     };
366     for (int i=0; i<rptGroups.length; i++) {
367       PhrasePositions[] rg = rgs.get(i).toArray(new PhrasePositions[0]);
368       Arrays.sort(rg, cmprtr);
369       rptGroups[i] = rg;
370       for (int j=0; j<rg.length; j++) {
371         rg[j].rptInd = j; // we use this index for efficient re-queuing
372       }
373     }
374   }
375 
376   /** Detect repetition groups. Done once - for first doc */
377   private ArrayList<ArrayList<PhrasePositions>> gatherRptGroups(LinkedHashMap<Term,Integer> rptTerms) throws IOException {
378     PhrasePositions[] rpp = repeatingPPs(rptTerms); 
379     ArrayList<ArrayList<PhrasePositions>> res = new ArrayList<>();
380     if (!hasMultiTermRpts) {
381       // simpler - no multi-terms - can base on positions in first doc
382       for (int i=0; i<rpp.length; i++) {
383         PhrasePositions pp = rpp[i];
384         if (pp.rptGroup >=0) continue; // already marked as a repetition
385         int tpPos = tpPos(pp);
386         for (int j=i+1; j<rpp.length; j++) {
387           PhrasePositions pp2 = rpp[j];
388           if (
389               pp2.rptGroup >=0        // already marked as a repetition
390               || pp2.offset == pp.offset // not a repetition: two PPs are originally in same offset in the query! 
391               || tpPos(pp2) != tpPos) {  // not a repetition
392             continue; 
393           }
394           // a repetition
395           int g = pp.rptGroup;
396           if (g < 0) {
397             g = res.size();
398             pp.rptGroup = g;  
399             ArrayList<PhrasePositions> rl = new ArrayList<>(2);
400             rl.add(pp);
401             res.add(rl); 
402           }
403           pp2.rptGroup = g;
404           res.get(g).add(pp2);
405         }
406       }
407     } else {
408       // more involved - has multi-terms
409       ArrayList<HashSet<PhrasePositions>> tmp = new ArrayList<>();
410       ArrayList<FixedBitSet> bb = ppTermsBitSets(rpp, rptTerms);
411       unionTermGroups(bb);
412       HashMap<Term,Integer> tg = termGroups(rptTerms, bb);
413       HashSet<Integer> distinctGroupIDs = new HashSet<>(tg.values());
414       for (int i=0; i<distinctGroupIDs.size(); i++) {
415         tmp.add(new HashSet<PhrasePositions>());
416       }
417       for (PhrasePositions pp : rpp) {
418         for (Term t: pp.terms) {
419           if (rptTerms.containsKey(t)) {
420             int g = tg.get(t);
421             tmp.get(g).add(pp);
422             assert pp.rptGroup==-1 || pp.rptGroup==g;  
423             pp.rptGroup = g;
424           }
425         }
426       }
427       for (HashSet<PhrasePositions> hs : tmp) {
428         res.add(new ArrayList<>(hs));
429       }
430     }
431     return res;
432   }
433 
434   /** Actual position in doc of a PhrasePosition, relies on that position = tpPos - offset) */
435   private final int tpPos(PhrasePositions pp) {
436     return pp.position + pp.offset;
437   }
438 
439   /** find repeating terms and assign them ordinal values */
440   private LinkedHashMap<Term,Integer> repeatingTerms() {
441     LinkedHashMap<Term,Integer> tord = new LinkedHashMap<>();
442     HashMap<Term,Integer> tcnt = new HashMap<>();
443     for (PhrasePositions pp : phrasePositions) {
444       for (Term t : pp.terms) {
445         Integer cnt0 = tcnt.get(t);
446         Integer cnt = cnt0==null ? new Integer(1) : new Integer(1+cnt0.intValue());
447         tcnt.put(t, cnt);
448         if (cnt==2) {
449           tord.put(t,tord.size());
450         }
451       }
452     }
453     return tord;
454   }
455 
456   /** find repeating pps, and for each, if has multi-terms, update this.hasMultiTermRpts */
457   private PhrasePositions[] repeatingPPs(HashMap<Term,Integer> rptTerms) {
458     ArrayList<PhrasePositions> rp = new ArrayList<>();
459     for (PhrasePositions pp : phrasePositions) {
460       for (Term t : pp.terms) {
461         if (rptTerms.containsKey(t)) {
462           rp.add(pp);
463           hasMultiTermRpts |= (pp.terms.length > 1);
464           break;
465         }
466       }
467     }
468     return rp.toArray(new PhrasePositions[0]);
469   }
470   
471   /** bit-sets - for each repeating pp, for each of its repeating terms, the term ordinal values is set */
472   private ArrayList<FixedBitSet> ppTermsBitSets(PhrasePositions[] rpp, HashMap<Term,Integer> tord) {
473     ArrayList<FixedBitSet> bb = new ArrayList<>(rpp.length);
474     for (PhrasePositions pp : rpp) {
475       FixedBitSet b = new FixedBitSet(tord.size());
476       Integer ord;
477       for (Term t: pp.terms) {
478         if ((ord=tord.get(t))!=null) {
479           b.set(ord);
480         }
481       }
482       bb.add(b);
483     }
484     return bb;
485   }
486   
487   /** union (term group) bit-sets until they are disjoint (O(n^^2)), and each group have different terms */
488   private void unionTermGroups(ArrayList<FixedBitSet> bb) {
489     int incr;
490     for (int i=0; i<bb.size()-1; i+=incr) {
491       incr = 1;
492       int j = i+1;
493       while (j<bb.size()) {
494         if (bb.get(i).intersects(bb.get(j))) {
495           bb.get(i).or(bb.get(j));
496           bb.remove(j);
497           incr = 0;
498         } else {
499           ++j;
500         }
501       }
502     }
503   }
504   
505   /** map each term to the single group that contains it */ 
506   private HashMap<Term,Integer> termGroups(LinkedHashMap<Term,Integer> tord, ArrayList<FixedBitSet> bb) throws IOException {
507     HashMap<Term,Integer> tg = new HashMap<>();
508     Term[] t = tord.keySet().toArray(new Term[0]);
509     for (int i=0; i<bb.size(); i++) { // i is the group no.
510       FixedBitSet bits = bb.get(i);
511       for (int ord = bits.nextSetBit(0); ord != DocIdSetIterator.NO_MORE_DOCS; ord = ord + 1 >= bits.length() ? DocIdSetIterator.NO_MORE_DOCS : bits.nextSetBit(ord + 1)) {
512         tg.put(t[ord],i);
513       }
514     }
515     return tg;
516   }
517 
518   @Override
519   public int freq() {
520     return numMatches;
521   }
522 
523   float sloppyFreq() {
524     return sloppyFreq;
525   }
526   
527 //  private void printQueue(PrintStream ps, PhrasePositions ext, String title) {
528 //    //if (min.doc != ?) return;
529 //    ps.println();
530 //    ps.println("---- "+title);
531 //    ps.println("EXT: "+ext);
532 //    PhrasePositions[] t = new PhrasePositions[pq.size()];
533 //    if (pq.size()>0) {
534 //      t[0] = pq.pop();
535 //      ps.println("  " + 0 + "  " + t[0]);
536 //      for (int i=1; i<t.length; i++) {
537 //        t[i] = pq.pop();
538 //        assert t[i-1].position <= t[i].position;
539 //        ps.println("  " + i + "  " + t[i]);
540 //      }
541 //      // add them back
542 //      for (int i=t.length-1; i>=0; i--) {
543 //        pq.add(t[i]);
544 //      }
545 //    }
546 //  }
547   
548   
549   @Override
550   public int docID() {
551     return conjunction.docID(); 
552   }
553 
554   @Override
555   public int nextDoc() throws IOException {
556     int doc;
557     for (doc = conjunction.nextDoc(); doc != NO_MORE_DOCS; doc = conjunction.nextDoc()) {
558       sloppyFreq = phraseFreq(); // check for phrase
559       if (sloppyFreq != 0f) {
560         break;
561       }
562     }
563 
564     return doc;
565   }
566   
567   @Override
568   public float score() {
569     return docScorer.score(docID(), sloppyFreq);
570   }
571 
572   @Override
573   public int advance(int target) throws IOException {
574     assert target > docID();
575     int doc;
576     for (doc = conjunction.advance(target); doc != NO_MORE_DOCS; doc = conjunction.nextDoc()) {
577       sloppyFreq = phraseFreq(); // check for phrase
578       if (sloppyFreq != 0f) {
579         break;
580       }
581     }
582 
583     return doc;
584   }
585 
586   @Override
587   public long cost() {
588     return conjunction.cost();
589   }
590 
591   @Override
592   public String toString() { return "scorer(" + weight + ")"; }
593 
594   @Override
595   public TwoPhaseIterator asTwoPhaseIterator() {
596     return new TwoPhaseIterator(conjunction) {
597       @Override
598       public boolean matches() throws IOException {
599         sloppyFreq = phraseFreq(); // check for phrase
600         return sloppyFreq != 0F;
601       }
602 
603       @Override
604       public float matchCost() {
605         return matchCost;
606       }
607 
608       @Override
609       public String toString() {
610         return "SloppyPhraseScorer@asTwoPhaseIterator(" + SloppyPhraseScorer.this + ")";
611       }
612     };
613   }
614 }